We'll explore the use of the K-nearest neighbors algorithm, an extremely simple machine learning method, to fit our data, assess how well it performs, and compare its performance to other machine learning algorithms.
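As a quick illustration of the idea (a toy sketch, separate from the abalone data): a KNeighborsClassifier simply memorizes the training points and labels a new point by majority vote among its k nearest neighbors.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Four 1-D training points with two classes
X_toy = np.array([[0.0], [0.1], [1.0], [1.1]])
y_toy = np.array([0, 0, 1, 1])
toy_knn = KNeighborsClassifier(n_neighbors=3)
toy_knn.fit(X_toy, y_toy)
# Two of the three nearest neighbors of 0.2 belong to class 0, so the vote is 0
print toy_knn.predict([[0.2]])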
%matplotlib inline
# numbers, stats, plots
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
# sklearn support
from sklearn import metrics, cross_validation, preprocessing
from sklearn.datasets.base import Bunch
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ShuffleSplit, StratifiedKFold
# machine learning algorithm of interest
from sklearn.neighbors import KNeighborsClassifier
#import pickle
Start by loading the dataset into a scikit-learn Bunch object:
def load_data():
    # Load the data from this file
    data_file = 'abalone/Dataset.data'

    # x data labels
    xnlabs = ['Sex']
    xqlabs = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']
    xlabs = xnlabs + xqlabs

    # y data labels
    ylabs = ['Rings']

    # Load data to dataframe
    df = pd.read_csv(data_file, header=None, sep=' ', names=xlabs+ylabs)

    # Filter zero values of height/length/diameter
    df = df[df['Height']>0.0]
    df = df[df['Length']>0.0]
    df = df[df['Diameter']>0.0]

    # Add a volume feature
    df['Volume'] = df['Height']*df['Length']*df['Diameter']
    xqlabs.append('Volume')

    # One-hot encode the categorical Sex variable
    dummies = pd.get_dummies(df[xnlabs], prefix='Sex')
    dfdummies = df[xqlabs+ylabs].join(dummies)
    xqlabs = xqlabs + dummies.columns.tolist()

    return Bunch(data = dfdummies[xqlabs],
                 target = df[ylabs],
                 feature_names = xqlabs,
                 target_names = ylabs)
# Load the dataset
dataset = load_data()
X = dataset.data
y = dataset.target
print X.head()
print "-"*20
print y.head()
# Split into a training set and a test set
X_train, X_test, y_train, y_test = \
cross_validation.train_test_split(X, y, test_size=0.2)
We have loaded the data, and split it into a test set and a training set. Now we're ready to run the k-nearest neighbors algorithm on the result. (Note: once we've worked out the details, we'll create a pipeline to standardize and cross-validate.) Start with a k-nearest neighbors classifier model:
from sklearn.neighbors import KNeighborsClassifier
def make_spiffy(y):
    # Flatten an (N,1) target DataFrame into an (N,) array,
    # since scikit-learn estimators expect a 1-D target
    # (equivalent to y.values.ravel())
    return y.values.reshape(len(y.values))
knc = KNeighborsClassifier(n_neighbors=10)
knc.fit(X_train, make_spiffy(y_train))
print knc
# Make predictions
yhat_test = knc.predict(X_test)
# This is not usually a good way to assess categorical models,
# but in this case, we're guessing age, so the categories are quantitative.
print knc.score(X_test,y_test)
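# Illustrative aside: because ring counts are ordinal, exact-match accuracy is a
# harsh metric (off by one ring counts the same as off by ten). The mean absolute
# error, in rings, gives a distance-aware view of the same predictions:
print metrics.mean_absolute_error(make_spiffy(y_test), yhat_test)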
## Yikes. This model may not be worth saving.
#with open('logistic_regression.pickle', 'w') as f:
# pickle.dump(model, f)
fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(111)
sns.heatmap(metrics.confusion_matrix(y_test, yhat_test),
cmap="GnBu", square=True, ax=ax)
ax.set_title('Heatmap: Confusion Matrix for \nKNN Classifier Model')
ax.set_xlabel('Predicted Age')
ax.set_ylabel('Actual Age')
plt.show()
#print metrics.confusion_matrix(y_test, yhat_test)
print metrics.classification_report(y_test, yhat_test)
resid = make_spiffy(y_test) - yhat_test
print np.mean(resid)
print np.std(resid)
fig = plt.figure(figsize=(4,4))
ax = fig.add_subplot(111)
stats.probplot(resid, dist='norm', plot=ax)
plt.show()
With a score of 0.20, it's going to take a lot of work to get this model up to a level of accuracy comparable with, say, linear regression or support vector regression.
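One way to see how harsh exact-match accuracy is here (an illustrative aside, not part of the original analysis) is to count how often the prediction lands within one ring of the true value:
# Fraction of test predictions within +/- 1 ring of the true age
within_one = np.abs(make_spiffy(y_test) - yhat_test) <= 1
print np.mean(within_one)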
# Make a K-nearest neighbors classifier model
knc = KNeighborsClassifier(n_neighbors=10)
# Make a ShuffleSplit object to split data into training/testing data sets randomly
cv = ShuffleSplit(n_splits=4, test_size=0.3, random_state=0)
# This will be our "model":
# a pipeline that scales our inputs first,
# then passes them to the K-nearest neighbors model
clf = make_pipeline(preprocessing.StandardScaler(), knc)
cross_val_score(clf, X, make_spiffy(y), cv=cv)
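The array returned by cross_val_score holds one score per split; keeping it in a variable and summarizing it (a small convenience sketch) makes later comparisons easier:
scores = cross_val_score(clf, X, make_spiffy(y), cv=cv)
print "Mean CV accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std())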
Now that we have a more quantitative way to assess our models, let's use this cross-validation pipeline to see whether adding a volume term (and a few other derived features) improves the K-nearest neighbors model.
def load_data_with_volume():
    # Load the data from this file
    data_file = 'abalone/Dataset.data'

    # x data labels
    xnlabs = ['Sex']
    xqlabs = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']
    xlabs = xnlabs + xqlabs

    # y data labels
    ylabs = ['Rings']

    # Load data to dataframe
    df = pd.read_csv(data_file, header=None, sep=' ', names=xlabs+ylabs)

    # Filter zero values of height/length/diameter
    df = df[df['Height']>0.0]
    df = df[df['Length']>0.0]
    df = df[df['Diameter']>0.0]

    # -----------------------------
    # Add volume
    df['Volume'] = df['Height']*df['Length']*df['Diameter']
    xqlabs.append('Volume')

    # Add dimensions squared
    sq = lambda x : x*x
    df['Height2'] = df['Height'].apply(sq)
    df['Length2'] = df['Length'].apply(sq)
    df['Diameter2'] = df['Diameter'].apply(sq)
    xqlabs.append('Height2')
    xqlabs.append('Length2')
    xqlabs.append('Diameter2')

    # Add interactions
    df['Height-Length'] = df['Height']*df['Length']
    df['Length-Diameter'] = df['Length']*df['Diameter']
    df['Height-Diameter'] = df['Height']*df['Diameter']
    xqlabs.append('Height-Length')
    xqlabs.append('Length-Diameter')
    xqlabs.append('Height-Diameter')

    # Add dimensions cubed
    cube = lambda x : x*x*x
    df['Height3'] = df['Height'].apply(cube)
    df['Length3'] = df['Length'].apply(cube)
    df['Diameter3'] = df['Diameter'].apply(cube)
    xqlabs.append('Height3')
    xqlabs.append('Length3')
    xqlabs.append('Diameter3')
    # -----------------------------

    # One-hot encode the categorical Sex variable
    dummies = pd.get_dummies(df[xnlabs], prefix='Sex')
    dfdummies = df[xqlabs+ylabs].join(dummies)
    xqlabs = xqlabs + dummies.columns.tolist()

    return Bunch(data = dfdummies[xqlabs],
                 target = df[ylabs],
                 feature_names = xqlabs,
                 target_names = ylabs)
# Load the dataset
datasetV = load_data_with_volume()
XV = datasetV.data
yV = datasetV.target
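As a quick optional sanity check, we can confirm that the new polynomial and interaction columns made it into the feature matrix:
print XV.columns.tolist()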
Next we'll use the expanded data loading function, with its volume, squared, cubed, and interaction terms, to build a K-nearest neighbors model on the enriched feature set.
# Make a K-nearest neighbors classifier model
knc = KNeighborsClassifier(n_neighbors=10)
# Make a ShuffleSplit object to split data into training/testing data sets randomly
cv = ShuffleSplit(n_splits=4, test_size=0.3, random_state=0)
# This will be our "model":
# a pipeline that scales our inputs first,
# then passes them to the K-nearest neighbors model
clf = make_pipeline(preprocessing.StandardScaler(), knc)
cross_val_score(clf, XV, make_spiffy(yV), cv=cv)
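For a side-by-side comparison with the original feature set (a sketch assuming the earlier X, y, cv, and pipeline clf are still defined in the notebook):
scores_base = cross_val_score(clf, X, make_spiffy(y), cv=cv)
scores_vol = cross_val_score(clf, XV, make_spiffy(yV), cv=cv)
print "Original features:  %0.3f" % scores_base.mean()
print "Augmented features: %0.3f" % scores_vol.mean()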